Extract the full dataset for all four books together

# Download the four novels in a single call: Heart of Darkness (219),
# The Importance of Being Earnest (844), Jekyll and Hyde (43), Ulysses (4300).
library(gutenbergr)
library(tidyverse)

book_ids <- c(219, 844, 43, 4300)
all_books_df <- gutenberg_download(
  book_ids,
  mirror = "http://mirrors.xmission.com/gutenberg/"
)

Tokenize the dataset of all four books together using the unnest_tokens command and create a tidy dataset

library(tidytext)

# One row per word: record each text row's original line number, then split
# the `text` column into individual lower-cased word tokens.
all_tokens <-
  all_books_df %>%
  mutate(line = row_number()) %>%
  unnest_tokens(word, text)

# Most frequent tokens across the whole collection (stop words still included).
all_tokens %>%
  count(word, sort = TRUE)

Remove the stop words

# Drop stop words using the SMART lexicon.
# NOTE(review): the per-book pipelines later in this script filter with
# tidytext::stop_words (all three lexicons combined), so counts here are not
# directly comparable to those — confirm the difference is intentional.
smart_stops <- get_stopwords(source = "smart")

all_tidybook <-
  all_tokens %>%
  anti_join(smart_stops, by = "word")

all_tidybook %>%
  count(word, sort = TRUE)

Plot the top 20 common words in the collection of books

Analysis of each book separately

Tokenize each book and create tidy datasets making sure that you use stop words

library(gutenbergr)
library(dplyr)
library(tidytext)

# The four books go through an identical pipeline, so it is factored out
# here instead of being copy-pasted four times: download one book, number
# its lines, tokenize into words, and drop stop words
# (tidytext::stop_words — all lexicons combined).
tidy_download <- function(book_id) {
  gutenberg_download(book_id,
                     mirror = "http://mirrors.xmission.com/gutenberg/") %>%
    mutate(line = row_number()) %>%
    unnest_tokens(word, text) %>%
    anti_join(stop_words, by = "word")
}

heartofdark_tidy <- tidy_download(219)    # Heart of Darkness
heartofdark_tidy %>% count(word, sort = TRUE)

earnest_tidy <- tidy_download(844)        # The Importance of Being Earnest
earnest_tidy %>% count(word, sort = TRUE)

jekyllandhyde_tidy <- tidy_download(43)   # Dr. Jekyll and Mr. Hyde
jekyllandhyde_tidy %>% count(word, sort = TRUE)

ulysses_tidy <- tidy_download(4300)       # Ulysses
ulysses_tidy %>% count(word, sort = TRUE)

Create tidy data sets for each book by author making sure that you use the unnest_tokens command and stop words

Extract IDs for books by each author
#Works by Joseph Conrad(Heart of Darkness)
gutenberg_conrad <- gutenberg_works(author == "Conrad, Joseph")
# Keep only the ID column, selecting by name rather than base subset().
conrad_id <- gutenberg_conrad %>% select(gutenberg_id)
print(conrad_id)
# A tibble: 34 x 1
   gutenberg_id
          <int>
 1          219
 2          220
 3          451
 4          493
 5          494
 6          495
 7          525
 8          527
 9          638
10          687
# ... with 24 more rows
# Works by Oscar Wilde
gutenberg_wilde <- gutenberg_works(author == "Wilde, Oscar")
# Keep only the ID column, selecting by name rather than base subset().
wilde_id <- gutenberg_wilde %>% select(gutenberg_id)
print(wilde_id)
# A tibble: 31 x 1
   gutenberg_id
          <int>
 1          174
 2          301
 3          773
 4          774
 5          790
 6          844
 7          854
 8          873
 9          875
10          885
# ... with 21 more rows
#Works by Robert Louis Stevenson
gutenberg_louis_stevenson <- gutenberg_works(author == "Stevenson, Robert Louis")
# Keep only the ID column, selecting by name rather than base subset().
louis_stevenson_id <- gutenberg_louis_stevenson %>% select(gutenberg_id)
print(louis_stevenson_id)
# A tibble: 67 x 1
   gutenberg_id
          <int>
 1           42
 2          120
 3          136
 4          280
 5          281
 6          322
 7          329
 8          343
 9          344
10          372
# ... with 57 more rows
#Works by James Joyce
gutenberg_Joyce <- gutenberg_works(author == "Joyce, James")
# Keep only the ID column, selecting by name rather than base subset().
joyce_id <- gutenberg_Joyce %>% select(gutenberg_id)
print(joyce_id)
# A tibble: 4 x 1
  gutenberg_id
         <int>
1         2814
2         2817
3         4217
4         4300
Creating tidy data frames for each author
# Each author corpus goes through the same tokenize-and-clean step, so it is
# written once as a helper instead of four copy-pasted pipelines: number the
# lines, split text into words, and remove stop words.
tidy_tokens <- function(books_df) {
  books_df %>%
    mutate(line = row_number()) %>%
    unnest_tokens(word, text) %>%
    anti_join(stop_words, by = "word")
}

# Hoist the mirror URL so it is defined in one place.
gutenberg_mirror <- "http://mirrors.xmission.com/gutenberg/"

joseph_conrad <- gutenberg_download(conrad_id$gutenberg_id,
                                    mirror = gutenberg_mirror)
tidy_joseph_conrad <- tidy_tokens(joseph_conrad)
tidy_joseph_conrad %>%
  count(word, sort = TRUE)

oscar_Wild <- gutenberg_download(wilde_id$gutenberg_id,
                                 mirror = gutenberg_mirror)
tidy_oscar_wilde <- tidy_tokens(oscar_Wild)
tidy_oscar_wilde %>%
  count(word, sort = TRUE)

louis_stevenson <- gutenberg_download(louis_stevenson_id$gutenberg_id,
                                      mirror = gutenberg_mirror)
tidy_louis_stevenson <- tidy_tokens(louis_stevenson)
tidy_louis_stevenson %>%
  count(word, sort = TRUE)

james_joyce <- gutenberg_download(joyce_id$gutenberg_id,
                                  mirror = gutenberg_mirror)
tidy_james_joyce <- tidy_tokens(james_joyce)
tidy_james_joyce %>%
  count(word, sort = TRUE)

Use bind_rows to stack the four datasets and create frequency counts of the word distributions after calculating proportions.

# Stack the four author datasets, tagging each row with its author.
author_bind <- bind_rows(
  mutate(tidy_joseph_conrad, author = "Joseph Conrad"),
  mutate(tidy_oscar_wilde, author = "Oscar Wilde"),
  mutate(tidy_louis_stevenson, author = "Robert Louis Stevenson"),
  mutate(tidy_james_joyce, author = "James Joyce")
)

# Per-author word proportions, reshaped so "James Joyce" stays as a wide
# reference column while the other three authors are long.
# spread()/gather() are superseded: pivot_wider()/pivot_longer() replace
# them, and the columns to lengthen are named explicitly — the old
# `Joseph Conrad`:`Oscar Wilde`:`Robert Louis Stevenson` chained-range
# selection was fragile and order-dependent.
frequency <- author_bind %>%
  mutate(word = str_extract(word, "[a-z']+")) %>%  # strip leading punctuation/underscores
  count(author, word) %>%
  group_by(author) %>%
  mutate(proportion = n / sum(n)) %>%
  ungroup() %>%
  select(-n) %>%                                   # drop the raw count
  pivot_wider(names_from = author, values_from = proportion) %>%
  pivot_longer(
    cols = c(`Joseph Conrad`, `Oscar Wilde`, `Robert Louis Stevenson`),
    names_to = "author",
    values_to = "proportion"
  )

# Inspect the reshaped frequency table.
names(frequency)
#> [1] "word"        "James Joyce" "author"      "proportion"
# tabyl() comes from the janitor package, which this script never attaches;
# namespace-qualify the call so it works without library(janitor).
janitor::tabyl(frequency$author)
head(frequency)

Create word frequency plots for each of the three authors using James Joyce as the standard as we demonstrated using Jane Austen in the Chapter 1 code

Compute correlations between James Joyce against each of the three other authors. Use the code examples in Chapter 1.

# Correlation between Conrad's word proportions and Joyce's reference column.
conrad_rows <- frequency %>% filter(author == "Joseph Conrad")
cor.test(~ proportion + `James Joyce`, data = conrad_rows)

    Pearson's product-moment correlation

data:  proportion and James Joyce
t = 117.04, df = 16548, p-value < 2.2e-16
alternative hypothesis: true correlation is not equal to 0
95 percent confidence interval:
 0.6645450 0.6812178
sample estimates:
      cor 
0.6729669 
# Correlation between Wilde's word proportions and Joyce's reference column.
wilde_rows <- frequency %>% filter(author == "Oscar Wilde")
cor.test(~ proportion + `James Joyce`, data = wilde_rows)

    Pearson's product-moment correlation

data:  proportion and James Joyce
t = 95.252, df = 14270, p-value < 2.2e-16
alternative hypothesis: true correlation is not equal to 0
95 percent confidence interval:
 0.6133099 0.6333710
sample estimates:
      cor 
0.6234431 
# Correlation between Stevenson's word proportions and Joyce's reference column.
stevenson_rows <- frequency %>% filter(author == "Robert Louis Stevenson")
cor.test(~ proportion + `James Joyce`, data = stevenson_rows)

    Pearson's product-moment correlation

data:  proportion and James Joyce
t = 144.45, df = 19709, p-value < 2.2e-16
alternative hypothesis: true correlation is not equal to 0
95 percent confidence interval:
 0.7102730 0.7238365
sample estimates:
      cor 
0.7171226 

Do a sentiment analysis of the positive and negative words using the tidy dataset from James Joyce

Build AFINN and combined Bing and nrc sentiment analysis methods using inner join, binding them together and visualizing using ggplot

The plots for all three approaches are comparable, with NRC having the largest positive skew. AFINN and Bing are nearly identical in their ability to distinguish between positive and negative words, while Bing has more negative spikes and longer continuous positive and negative word lengths.

The most positive emotion is shown by NRC, while the highest negative sentiment is shown by AFINN.

In NRC and Bing, we’ll examine the contribution of positive and negative words because they have the most volatility.

# How many positive vs negative entries does each lexicon contain?
nrc_lexicon <- get_sentiments("nrc")
nrc_lexicon %>%
  filter(sentiment == "positive" | sentiment == "negative") %>%
  count(sentiment)

bing_lexicon <- get_sentiments("bing")
bing_lexicon %>%
  count(sentiment)

Negative terms are more common than positive ones in both cases, but the ratio of negative to positive words is larger in Bing, which makes sense given the plots we observed.

Finally, checking the most common positive and negative words

Analysis on combined tidy dataset

Calculate the tf_idf for all the tokens in the combined dataset. Use the bind_tf_idf function. Use the code in Chapter 3.

#Taking combined tidy data set of all books created in the beginning; all_tidybook
#Adding title column to tidy data
# Take the combined tidy dataset built at the top (all_tidybook) and add a
# human-readable title for each Gutenberg ID.
level_key <- c(`219`  = "Heart of Darkness",
               `844`  = "Importance of Being Earnest",
               `43`   = "Dr. Jekyll and Mr. Hyde",
               `4300` = "Ulysses")

# relocate() by name replaces the fragile positional
# subset(select = c(1, 4, 2, 3)): it yields the same
# gutenberg_id / title / line / word ordering, but keeps working even if the
# upstream column order changes.
all_tidybook <- all_tidybook %>%
  mutate(title = recode(gutenberg_id, !!!level_key)) %>%
  relocate(title, .after = gutenberg_id)

# tf-idf for every token, treating each title as one document.
all_book_tf_idf <- all_tidybook %>%
  count(title, word, sort = TRUE) %>%
  bind_tf_idf(word, title, n)

all_book_tf_idf

Show the top 10 words which have the highest tf_idf for all books

# Top 10 tokens by tf-idf across the whole collection.
all_book_tf_idf %>%
  arrange(desc(tf_idf)) %>%
  slice_head(n = 10)

Plot the tf_idf for each book separately using the combined dataset as column plots. Use group_by(title) together with the fct_reorder and geom_col syntax for the ggplots.

# For each book, keep its 15 highest tf-idf words, then draw them as
# horizontal bars — one facet per title with independent scales.
top_by_title <- all_book_tf_idf %>%
  group_by(title) %>%
  slice_max(tf_idf, n = 15) %>%
  ungroup()

ggplot(top_by_title,
       aes(x = tf_idf, y = fct_reorder(word, tf_idf), fill = title)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~ title, ncol = 2, scales = "free") +
  labs(x = "tf-idf", y = NULL)